Built with R version:
3.4.2


Built with R version:
3.4.2

Libraries

Load necessary libraries

library(affy)
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, cbind, colMeans,
##     colnames, colSums, do.call, duplicated, eval, evalq, Filter,
##     Find, get, grep, grepl, intersect, is.unsorted, lapply,
##     lengths, Map, mapply, match, mget, order, paste, pmax,
##     pmax.int, pmin, pmin.int, Position, rank, rbind, Reduce,
##     rowMeans, rownames, rowSums, sapply, setdiff, sort, table,
##     tapply, union, unique, unsplit, which, which.max, which.min
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
library(ComplexHeatmap)
## Loading required package: grid
## ========================================
## ComplexHeatmap version 1.17.1
## Bioconductor page: http://bioconductor.org/packages/ComplexHeatmap/
## Github page: https://github.com/jokergoo/ComplexHeatmap
## Documentation: http://bioconductor.org/packages/ComplexHeatmap/
## 
## If you use it in published research, please cite:
## Gu, Z. Complex heatmaps reveal patterns and correlations in multidimensional 
##   genomic data. Bioinformatics 2016.
## ========================================
library(plot3D)
library(gplots)
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
library(circlize)
## ========================================
## circlize version 0.4.3
## CRAN page: https://cran.r-project.org/package=circlize
## Github page: https://github.com/jokergoo/circlize
## Documentation: http://jokergoo.github.io/circlize_book/book/
## 
## If you use it in published research, please cite:
## Gu, Z. circlize implements and enhances circular visualization 
##   in R. Bioinformatics 2014.
## ========================================
library(AnnotationDbi)
## Loading required package: stats4
## Loading required package: IRanges
## Loading required package: S4Vectors
## 
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:gplots':
## 
##     space
## The following object is masked from 'package:base':
## 
##     expand.grid
library(limma)
## 
## Attaching package: 'limma'
## The following object is masked from 'package:BiocGenerics':
## 
##     plotMA
library(lattice)
library(org.Hs.eg.db)
## 
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:AnnotationDbi':
## 
##     select
library(RColorBrewer)
library(AnnotationDbi)
library(rglwidget)
## The functions in the rglwidget package have been moved to rgl.
library(VennDiagram)
## Loading required package: futile.logger
library(org.Hs.eg.db)
library(GenomicRanges)
## Loading required package: GenomeInfoDb
library(GenomicFeatures)
library(rtracklayer)
library(biomaRt)
library(glmnet)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following object is masked from 'package:S4Vectors':
## 
##     expand
## Loading required package: foreach
## Loaded glmnet 2.0-16
library(survival)
library(Hmisc)
## Loading required package: Formula
## Loading required package: ggplot2
## 
## Attaching package: 'Hmisc'
## The following object is masked from 'package:AnnotationDbi':
## 
##     contents
## The following object is masked from 'package:Biobase':
## 
##     contents
## The following objects are masked from 'package:base':
## 
##     format.pval, units
library(ConsensusClusterPlus)
library(pheatmap)
library(ggplot2)
library(heatmap.plus)
library(rgl)
library("tmod")

set1 = c(brewer.pal(9,"Set1"), brewer.pal(8, "Dark2"))

violinJitter <- function(x, magnitude=1){
  d <- density(x)
  data.frame(x=x, y=runif(length(x),-magnitude/2, magnitude/2) * approxfun(d$x, d$y)(x))
}

rotatedLabel <- function(x0 = seq_along(labels), y0 = rep(par("usr")[3], length(labels)), labels, pos = 1, cex=1, srt=45, ...) {
  w <- strwidth(labels, units="user", cex=cex)
  h <- strheight(labels, units="user",cex=cex)
  u <- par('usr')
  p <- par('plt')
  f <- par("fin")
  xpd <- par("xpd")
  par(xpd=NA)
  text(x=x0 + ifelse(pos==1, -1,1) * w/2*cos(srt/360*2*base::pi), y = y0 + ifelse(pos==1, -1,1) * w/2 *sin(srt/360*2*base::pi) * (u[4]-u[3])/(u[2]-u[1]) / (p[4]-p[3]) * (p[2]-p[1])* f[1]/f[2] , labels, las=2, cex=cex, pos=pos, adj=1, srt=srt,...)
  par(xpd=xpd)
}


avefc = function (y, log=TRUE, replace= FALSE) {
     if (log) y = 2^y
   if (replace) y = y + (1-min(y))
   m = apply(y,1,mean)
     y.n = y/m  
     y.n2 = y.n
     y.n2 [y.n2 < 1] = 1/ (y.n2 [y.n2 < 1])
     ave.fc = apply (y.n2, 1, mean)
     return(ave.fc)
     }

Ensembl Library

For gene convertion from array to HUGO

ensembl = useMart( "ensembl", dataset = "hsapiens_gene_ensembl" )

Gene Expression Data

Upload or generate GEP normalized matrix

### choice 1: import processed matrix

setwd("~/Desktop/PTCL/GEP_PTCL_NOS/afinal_data_set/files/")
load("541_PTCL_batch_adjusted_geo.id.Rdata")

geneExpr = adj.data
# import batch and re-order accordingly
load("PTCL.batch.Rdata")
batch = batch [order(batch$nameNEW),]
batch.series = as.vector(batch$center)

### end of choice 1

### choice 2: generate your own affy object and custom data

# download CEL files from GEO series GSE6338, GSE19067, GSE19069, GSE40160, GSE58445, GSE65823 and EBI series ETABM702, ETABM783
# GSM368580.CEL, GSM368582.CEL, GSM368584.CEL, GSM368586.CEL, GSM368589.CEL, GSM368591.CEL, GSM368594.CEL, GSM472164.CEL, GSM1411278.CEL, GSM1411284.CEL, GSM1411285.CEL, GSM1411287.CEL, GSM1411355.CEL, GSM1411364.CEL, GSM1411368.CEL, GSM1411425.CEL, GSM1411427.CEL excluded from the analysis (see Methods for explaination")
### celfiles <- dir("~/Documents/DATI/PTCL.nos/GSE6338-GSE19067-GSE19069-GSE40160-GSE58445-GSE65823-ETABM702-ETABM783/", pattern = ".CEL")
### library(affy)
### gset = justRMA(celfile.path = "/Users/emagene/Documents/DATI/PTCL.nos/GSE6338-GSE19067-GSE19069-GSE40160-GSE58445-GSE65823-ETABM702-ETABM783/", ### filenames = celfiles, sampleNames = gsub(".CEL","", celfiles), cdfname = "hgu133plus2hsentrezgcdf")
### geneExpr = exprs(gset)
### batch adjustment
### library(sva)  
### # import batch and re-order accordingly
### load("./Rmd.files/PTCL.batch.Rdata")
### batch = batch [order(rownames(batch)),]
### batch.series = as.vector(batch$center)
### geneExprNEW = geneExpr [ , order(colnames(geneExpr)) ]
### geneExprNEW = geneExprNEW[grep("AFFX",rownames(geneExprNEW), invert=TRUE),]
### # check order correspondence and, if correct, adjust data
### if (all(colnames(geneExprNEW) == rownames(batch))) {
###   adj.data = ComBat (geneExprNEW, batch.series, mod = NULL, par.prior = TRUE, prior.plots = TRUE)
### } else {
###   cat("Error: colnames and batch did not correspond")
### }
### geneExpr = adj.data
### colnames(geneExpr) = as.vector(batch$nameNEW)
### end of choice 2

Clinical Data

Upload paz info with clinical and mutational data

pts.info.data <- read.table("~/Desktop/PTCL/GEP_PTCL_NOS/sept_2017_dropbox_luca/script (1)/Rmd.files/541_paz_info_MUT.txt", sep="\t", header=TRUE, check.names=FALSE, stringsAsFactors = F) 
# customize colors for categories
levels(as.factor(pts.info.data$final.molec))
##  [1] "AITL"     "ALCL.neg" "ALCL.pos" "ATLL"     "NKT"      "PTCL.nos"
##  [7] "T.CD30"   "T.CD4"    "T.CD8"    "T.DR"     "T.reg"    "TCR-HL"
# "AITL"     "ALCL.neg" "ALCL.pos" "ATLL"     "NKT"      "PTCL.nos" "T.CD30"   "T.CD4"    "T.CD8"    "T.DR"     "T.reg"    "TCR-HL"  
colorz = c("black", "yellow","dodgerblue2","brown2","darkorchid1", "orange", "grey42", "grey52","grey62","grey72","grey82","grey92")
temp = split (  pts.info.data$sample.nameNEW, pts.info.data$final.molec )
colorx = colnames(geneExpr)
length(colorz)
## [1] 12
length(temp)
## [1] 12
for (i in 1:length(colorz)) colorx [ which(colorx %in% unlist(temp[i])) ] = colorz[i]
library(gplots)
colorx = col2hex(colorx)    

Pie Chart with Percentages

slices <- table(pts.info.data$final.molec)
lbls <- names(table(pts.info.data$final.molec))
pct <- round(slices/sum(slices)*100)
lbls <- paste(lbls, ": ", slices, " (", pct, "%)", sep="" ) # add percents to labels 
par(mfrow=c(1,1))
par(mar=c(3,3,3,3), xpd=F)
pie(slices,labels = lbls, init.angle = 0, col=colorz, main="", cex=0.6, radius=0.8)

PCA

# apply variational filter

afc2 = avefc(geneExpr, log=TRUE, replace=FALSE)
data541exprs.vf = geneExpr [afc2 >= 2, ]
# retry PCA on shorted gene list
data541m = t(as.matrix(data541exprs.vf))
pca<-prcomp(data541m,scale=T)
# mfrow3d(nr = 1, nc = 1, sharedMouse = T)  
# plot3d(pca$x,rgl.use=F,col=colorx,size=0.6,type="s")
# rglwidget()

Heatmap

mat = as.matrix(data541exprs.vf)
base_mean = rowMeans(mat)
mat_scaled = t(apply(mat, 1, scale))
types = pts.info.data$final.molec
color.annot = col2hex(colorz); names (color.annot)= names(temp)  
ha = HeatmapAnnotation(df = data.frame(type = types) , col = list(type = c( color.annot ) ) )
ha@anno_list[[1]]@color_mapping@colors = col2hex(colorz)
names(ha@anno_list[[1]]@color_mapping@colors) = names(temp)
ht = Heatmap(mat_scaled, name = "expression", km = 7, clustering_method_columns = "ward.D", col = colorRamp2(c(-1, 0, 1), c("green", "white", "red")), top_annotation = ha, top_annotation_height = unit(4, "mm"), show_row_names = FALSE, show_column_names = FALSE)
column_order(ht)
##   [1]  91 236  85  86 429  88  64 311 270 271 229 231 180 257  87 227 258
##  [18] 304 309 436 431 234 148 147  31  29 450 281 500 521 499 273 272 269
##  [35] 522 520 501 519 498 518 497 505 506 504 486 485 482 483 484 502 503
##  [52] 524 527 529 528 530 532 531 525 526 523 434  61 182 176  63  83  66
##  [69] 445 230 515 517 513 296 298 516 494 492 496 493 314 297 294 295 293
##  [86] 289 292 512 507 511 510 509 291 290 514 508 448 447 444 440 396 397
## [103] 395 394 391  50  51 187 192 433 349 363 385 428 435 427 389 366 491
## [120] 401 334 459 392 402 388 457 463 446 380 351 474 372 376 489 487 537
## [137] 488 534 536 495 490 535 533 481 479 478 480 477 264 261 268 259 262
## [154] 253 267 265 266 263  65 254 256 255 318 308 320 319 315 324 326 313
## [171] 280 299 285 316 286 278 344 330 430 331 275 305 307 136 426 184 162
## [188] 167 422 421 469 341 245 160 179 153 443 449 181 149 368 186 164 141
## [205] 194 218 161  98 300 312  84 442 323 301 174 306 248 329 325 178 139
## [222] 145 142 144 185 157 249 172 177 158 183 150 170 171 152 131 134 137
## [239] 216 214 213 241 202 205 203 191 130 133 195 215 244 211 246 238 197
## [256] 224 226 225 169 538 168 539 540 219 223 220 221 222 206 204 232 242
## [273] 200 243 143 250 154 207 198 235 240 247 252 163 155 383 188 239 417
## [290] 276 406  93 303 339 109  53 124  82  26  32  23  22  20 189  18 398
## [307]  41  44  10 193 217 129   2 140 208 251 199 201 209 210 212 233 237
## [324] 288 284 420 439 274 352 287 441 328 283 282 321 317 310 279 322 332
## [341] 302 348 419 399 387 166 159 337 467 151 277 355 359 410 353 470 475
## [358] 466 175 370  42 411 374 393 404 327 456  94 432 541 156 173 146 165
## [375] 135 128 106 458 361 364 354  90 454  27 403 138 196 462 453 338 415
## [392] 407 379 371 464 461 110  79 121  45 405 425  96 102 424 423 360 452
## [409] 451 365 358  89   1  28  19 347 101 260 455  59 418 367  48  69 190
## [426] 333 132  17 378 350 468 346   9 414 409 413 412 408 116  70 122 123
## [443] 340 381 460 382  77  38 108  37 228  49  68  67 416  62  92  71   4
## [460] 103 126  36 120 127  56  74  13  12  72 104 105 100  21  30  25  33
## [477]  24 465 345 117 343 342 471 336 476 356  15  60 390 357 362  99  97
## [494] 437  39  78 438  80 375  40 113 112  52  58 400  57  55  54  76  11
## [511] 114  73  75 472   3   6 377 119  47  35 125   5  95  81  34  46  43
## [528]  16  14 386   8 473 335 111 384   7 118 373 369 107 115

Check relative log expression after batch correction

rle.custom = function (a, logged2 = TRUE, file = NULL, colorbox= NULL, labels=NULL , legend = NULL ) {
    a.m <- apply(a,1,median)
if (logged2) {
    for (i in 1:dim(a)[2]) {
         a [,i] <-  a [,i] - a.m 
    }
    } else {
        for (i in 1:dim(a)[2]) {
         a [,i] <-  log (a [,i] / a.m ) 
    }
    }
   # png(file,10240,3840)
    # par(mar=c(10,4,6,2))
    # boxplot (a, ylim= c(-5,5), outline=F, col=colorbox, xlab="pts", names=labels, las=2, cex.axis = 1.5, main="RLE", xlim = c(1,600), cex.main = 5 ) 
    # legend("bottomright",legend = c(levels(as.factor(pts.info.data$final.molec))),   
    #   fill = colorz, # 6:1 reorders so legend order matches graph
    #   title = "Legend", 
    #   cex = 5)
  #  dev.off()
    
    a.c = apply(a, 2, stats::quantile)
    return(a.c)
}

rle.medians = rle.custom(geneExpr, colorbox=colorx, file="./RLE.541.png", labels=pts.info.data$sample.nameNEW )
plot(rle.medians[3,], type="l", xlab="pts", ylab="RLE median" )

Final Gene Expression Matrix

Define design file and filter geneExpr for patients included in design data frame and

design <- pts.info.data[,c(1:2,6:8,14:17)]
rownames(design)<- design[,1]
design<- design[,-c(1:2)]
design<-na.omit(design) ### select onyl patients with all mutations data available (n=53)
design$age<- as.numeric(as.character(design$age))
design$age<- design$age - median(design$age)
design[design == "WT"] <- 0
design[design == "MUT"] <- 1
design$final.molec[design$final.molec=="AITL"] <- 0
design$final.molec[design$final.molec=="PTCL.nos"] <- 1
design$gender[design$gender=="M"] <- 1
design$gender[design$gender=="F"] <- 0
design$offset <- rep(1, nrow(design))
design<-design[,c(8,1:7)]

all(pts.info.data$sample.nameNEW == colnames(geneExpr)) ## check correspondence
## [1] TRUE
# geneExpr = geneExpr [ , order (pts.info.data$geo.id)] ### do only to set correspondence in case of custom procedure
# colnames(geneExpr) = pts.info.data$sample.nameNEW [ order (pts.info.data$geo.id)]

geneExpr2<- (geneExpr[, rownames(design)])
geneExpr2<- data.matrix(geneExpr2, rownames.force = NA)
design<- data.matrix(design, rownames.force = NA)

Model fitting

We use the lmFit function from the limma package. This comes with a whole series of powerful and reliable tests.

glm = lmFit(geneExpr2[,rownames(design)], design = design ) 
glm = eBayes(glm)
F.stat <- classifyTestsF(glm[,-1],fstat.only=TRUE)
glm$F <- as.vector(F.stat)
df1 <- attr(F.stat,"df1")
df2 <- attr(F.stat,"df2")
if(df2[1] > 1e6){ 
  glm$F.p.value <- pchisq(df1*glm$F,df1,lower.tail=FALSE)
}else
  glm$F.p.value <- pf(glm$F,df1,df2,lower.tail=FALSE)

set.seed(12345678)
rlm <- lmFit(geneExpr[,rownames(design)], apply(design, 2, sample))
rlm <- eBayes(rlm)
F.stat <- classifyTestsF(rlm[,-1],fstat.only=TRUE)
rlm$F <- as.vector(F.stat)
df1 <- attr(F.stat,"df1")
df2 <- attr(F.stat,"df2")
if(df2[1] > 1e6){ 
  rlm$F.p.value <- pchisq(df1*rlm$F,df1,lower.tail=FALSE)
}else
  rlm$F.p.value <- pf(rlm$F,df1,df2,lower.tail=FALSE)
F.stat <- classifyTestsF(glm[,2:5],fstat.only=TRUE) 
df1 <- attr(F.stat,"df1")
df2 <- attr(F.stat,"df2")
F.p.value <- pchisq(df1*F.stat,df1,lower.tail=FALSE)
R.stat <- classifyTestsF(rlm[,2:5],fstat.only=TRUE) 
Rall = 1 - 1/(1 + glm$F * (ncol(design)-1)/(nrow(design)-ncol(design)))
Rgenetics = 1 - 1/(1 + F.stat * 4/(nrow(design)-ncol(design)))
Pgenetics = 1 - 1/(1 + R.stat * 4/(nrow(design)-ncol(design)))
names(Rgenetics) <- names(Pgenetics) <- names(Rall) <-  rownames(geneExpr)

Differentially Expressed Genes

par(bty="n", mgp = c(2,.33,0), mar=c(3,2.5,1,1)+.1, las=1, tcl=-.25, xpd=NA)
d <- density(Pgenetics,bw=1e-3)
f <- 40 #nrow(gexpr)/512

par(mfrow=c(1,1))
par(mar=c(8,5,5,5), xpd=F)
plot(d$x, d$y * f, col='grey', xlab=expression(paste("Explained variance per gene ", R^2)), main="", lwd=2, type="l", ylab="", xlim=c(0,1), cex.axis=1.2, cex.lab=1.5, bty="n")
title(ylab="Density", line=2.5, cex.lab=1.5)
d <- density(Rgenetics, bw=1e-3)
r <- min(Rgenetics[p.adjust(F.p.value,"BH")<0.01]) ######## threshold to select 412 genes
x0 <- which(d$x>r)
polygon(d$x[c(x0[1],x0)], c(0,d$y[x0])* f, col=paste(set1[1],"44",sep=""), border=NA)
lines(d$x, d$y* f, col=set1[1], lwd=2)
text(d$x[x0[1]], d$y[x0[1]]*f +20, pos=4, paste(sum(Rgenetics > r), "genes q < 0.01"))
legend("topright", bty="n", col=c(set1[1], "grey"), lty=1, c("Observed","Random"), lwd=2)

glmPrediction <- glm$coefficients %*% t(design)
rlmPrediction <- rlm$coefficients %*% t(design)

Print signficiant genes

kk<-as.data.frame((p.adjust(F.p.value,"BH")<0.01))
kk$gene<- rownames(kk)
colnames(kk)[1]<-"code"
kk2<-kk[kk$code=="TRUE",]
### sort(kk2$gene) ##### if you want to print the entire list of differentially expressed genes

Significant effects per covariate

Extract the list of differentially expressed genes by mutations

### customize colors in colMutations 

colMutations = col2hex(c("magenta", "purple","gray60","red","lightblue","green","orange"))
names(colMutations) <- colnames(design)[-1]

gene_code<- kk2$gene
tab=NULL
for(i in (1:length(kk2$gene)))
{
  gene_single<- gene_code[i]
  y <- glm$coefficients[gene_single,-1]+glm$coefficients[gene_single,1]
  w <- glm$p.value[gene_single,-1] < 0.01
  int<-c(gene_single, as.character(w))
  tab<- rbind(tab, int)
}
rownames(tab)<-seq(1:nrow(tab))
colnames(tab)<- c("gene",colnames(design)[-1])

# Write to disk a file with all significant genes
#write.table(tab, "table_differentially_expressed_gene.txt",sep="\t", quote=F, row.names = F, col.names = T)

Example of extraction

  par(mfrow=c(1,1))
  par(mar=c(10,8,5,5), xpd=F)
  par(bty="n", mgp = c(1.5,.33,0),las=1, tcl=-.25, xpd=F)
  temp_name<- "YAP1"
  plot(glmPrediction[gene_single,], geneExpr[gene_single,rownames(design)], ylab="", xlab="", 
       pch=16, cex=1, cex.axis=1.2, cex.lab=1.5)
  title(ylab=(paste("Observed ",temp_name, " expression")), line=2.5, cex.lab=1.5)
   title( xlab=(paste("Predicted ",temp_name, " expression")), line=2.5, cex.lab=1.5)
  abline(0,1)
  u <- par("usr")
  par(xpd=NA)
  y <- glm$coefficients[gene_single,-1]+glm$coefficients[gene_single,1]
  u <- par("usr")
  x0 <- rep(u[3]+1,ncol(design)-1)
  y0 <- u[4] + 0.05*(u[4]-u[3]) - rank(-y)/length(y) * (u[4]-u[3])/1.2
  d <- density(y)
  lines(d$x, d$y/5+1+u[3], col="grey")
  lines(d$x, -d$y/5+1+u[3], col="grey")
points(x=y, y=x0+violinJitter(y, magnitude=0.25)$y, col=colMutations, pch=16, cex=1.5)
  text(x=glm$coefficients[gene_single,1], y= 5.2, "Model coefficients", cex=0.8)
legend("topleft",names(colMutations), col = colMutations, bty= "n", cex = 1.2, pch = 16)

Plot significant effects per covariate (q<0.05)

testResults <- decideTests(glm, method="hierarchical",adjust.method="BH", p.value=0.01)[,-1]
significantGenes <- sapply(1:ncol(testResults), function(j){
  c <- glm$coefficients[testResults[,j]!=0,j+1]
  table(cut(c, breaks=c(-5,seq(-1.5,1.5,l=7),5)))
})

colnames(significantGenes) <- colnames(testResults)
rownames(tab)<-c(1:nrow(tab))
tab2<- as.data.frame(tab)
tab2$gene<-as.character(as.character(tab2$gene))
tab2$final.molec<-as.character(as.character(tab2$final.molec))
tab2$TET2<-as.character(as.character(tab2$TET2))
tab2$RHOA<-as.character(as.character(tab2$RHOA))
tab2$IDH2<-as.character(as.character(tab2$IDH2))
tab2$DNMT3A<-as.character(as.character(tab2$DNMT3A))

par(mfrow=c(1,1))
par(bty="n", mgp = c(2.5,.33,0), mar=c(5,5.5,5,0)+.1, las=2, tcl=-.25)
b <- barplot(significantGenes, las=2, ylab = "Differentially expressed genes", col=brewer.pal(8,"RdYlBu"), legend.text=FALSE , border=0, xaxt="n", cex.lab=1.5)#, col = set1[simple.annot[names(n)]], border=NA)
rotatedLabel(x0=b-0.1, y0=rep(-0.5, ncol(significantGenes)), labels=colnames(significantGenes), cex=1.2, srt=45, font=ifelse(grepl("[[:lower:]]", colnames(design))[-1], 1,3), col=colMutations)
rotatedLabel(b-0.1, colSums(significantGenes), colSums(significantGenes), pos=3, cex=, srt=45)#dev.off()
clip(0,30,0,1000)
x0 <- 7.5
image(x=x0+c(0,0.8), y=par("usr")[4]+seq(-1,1,l=9) -4, z=matrix(1:8, ncol=8), col=brewer.pal(8,"RdYlBu"), add=TRUE)
text(x=x0+1.1, y=par("usr")[4]+c(-1,0,1) -4, format(seq(-1,1,l=3),2), cex=0.66)
lines(x=rep(x0+0.9, 2), y=par("usr")[4]+c(-1,1) -4)
segments(x0+0.9,par("usr")[4] + 1-4,x0+0.95,par("usr")[4] + 1-4)
segments(x0+0.9,par("usr")[4] + 0-4,x0+0.95,par("usr")[4] + 0-4)
segments(x0+0.9,par("usr")[4] + -1-4,x0+0.95,par("usr")[4] + -1-4)
text(x0 + 0.45, par("usr")[4] + 1.5-4, "log2 FC", cex=.66)

Print the list of differently expressed genes using the Ensembl annotation

select_hist<- pts.info.data[pts.info.data$final.molec == "AITL" |  pts.info.data$final.molec == "PTCL.nos",]
gene<- as.data.frame(testResults)
sig_genes<- gene[gene$final.molec!= 0 |gene$IDH2 != 0 | gene$TET2 != 0 | gene$DNMT3A != 0 | gene$RHOA != 0,]
list_genes<-sort(rownames(sig_genes)) ##### list of signficiant genes
geneannotation1 <- getBM( attributes = c("ensembl_transcript_id", "entrezgene", "external_gene_name"), filters = "entrezgene", values = gsub("_at","",list_genes), mart = ensembl)
sort(unique(geneannotation1$external_gene_name))
##  [1] "ADRA2A"   "ARHGEF10" "C3"       "COL4A4"   "DZIP1"    "EFNB2"   
##  [7] "HS3ST3A1" "ID2"      "NETO2"    "OSMR"     "PRRX1"    "ROBO1"   
## [13] "SLC5A3"   "XKR4"     "YAP1"

Generate a heatmap with AITL, PTCL-NOS with the extracted differentially expressed genes.

gep<- geneExpr[,select_hist$sample.nameNEW]
mat<- gep[list_genes,]


rownames(mat) = c("AL441992.1",unique(geneannotation1$external_gene_name))

mycol= c("red","white","yellow")
mylabel = select_hist[,c("sample.nameNEW","final.molec","IDH2","RHOA","TET2","DNMT3A")]
rownames(mylabel) = mylabel$sample.nameNEW
mylabel$sample.nameNEW = NULL
mylabel.nocol = mylabel
mylabel.col = mylabel
mylabel.col[is.na(mylabel.col)]<-0
#head(mylabel.col)
mylabel.col$final.molec[mylabel.col$final.molec == "AITL"] = "black"; mylabel.col$final.molec[mylabel.col$final.molec == "PTCL.nos"] = "orange"
for (a in 2:5) mylabel.col[,a] = factor(mylabel.col[,a], levels = levels(as.factor(mylabel.col[,a])), labels = mycol )

mat  <- mat - rowMeans(mat)
par(mfrow=c(1,1))
pheatmap(mat, annotation_col = mylabel.nocol, annotation_colors = list(final.molec = c(AITL = "black", PTCL.nos = "orange"), filename= "x.pdf",
                                  IDH2 = c(MUT=mycol[1],"NA"=mycol[2],WT=mycol[3]),
                                  RHOA = c(MUT=mycol[1],"NA"=mycol[2],WT=mycol[3]),
                                  TET2 = c(MUT=mycol[1],"NA"=mycol[2],WT=mycol[3]),
                                  DNMT3A = c(MUT=mycol[1],"NA"=mycol[2],WT=mycol[3]) ) , show_colnames = F, cellheight = 25,
         border_color= NA, color = colorRampPalette(rev(brewer.pal(n = 5 , name = "RdYlBu")))(100), scale = "row", clustering_method = "ward.D2",clustering_distance_cols = "euclidean" , silent = F)

Use ConsensusClusterPlus to extract most significant clusters

gep<- geneExpr[,select_hist$sample.nameNEW]
mat<- gep[list_genes,]
title=tempdir()
d<- data.matrix(mat)
d = sweep(d,1, apply(d,1,median,na.rm=T))
results = ConsensusClusterPlus(d,maxK=8,
                               pFeature=1,
                               title=title, 
                               clusterAlg="hc",
                               innerLinkage="ward.D2", 
                               finalLinkage="ward.D2", 
                               distance="euclidean", 
                               seed=123456789)
## end fraction
## clustered

## clustered

## clustered

## clustered

## clustered

## clustered

## clustered

kk<- as.data.frame((results[[4]]$consensusClass)) ##### 4 significant cluster
kk$geo.id<- rownames(kk)
colnames(kk)[1]<- "cluster"
table(kk$cluster)
## 
##   1   2   3   4 
##  69 107  23  72

Plot heatmap according to the 4-cluster stratification.

## add ConsensusCluster label, samples orderedaccordingly
heat<- merge(t(mat), kk, by.x = 0, by.y="geo.id")
heat2<- merge(heat, pts.info.data, by.x = 1, by.y="sample.nameNEW")
heat2<- heat2[order(heat2$cluster),]
mycol= c("red","white","yellow")
mylabel = heat2[,c("Row.names","cluster","final.molec","TET2","RHOA","IDH2","DNMT3A")]
colnames(mylabel)<- c("sample.names","clusters","Histology","TET2","RHOA","IDH2","DNMT3A")
rownames(mylabel) = mylabel$sample.names
mylabel$sample.names  = NULL
mylabel.nocol = mylabel
mylabel.col = mylabel
mylabel.col[is.na(mylabel.col)]<-0
#head(mylabel.col)
mylabel.col$Histology[mylabel.col$Histology == "AITL"] = "black"; mylabel.col$Histology[mylabel.col$Histology == "PTCL.nos"] = "orange"
#mylabel.col$Histology[mylabel.col$Histology == "ALCL.neg"] = "yellow"
for (a in c(3:6)) mylabel.col[,a] = factor(mylabel.col[,a], levels = levels(as.factor(mylabel.col[,a])), labels = mycol )
mycol_plus<- c(brewer.pal(11,"Paired"),brewer.pal(6,"Set2"))
for (a in 1) mylabel.col[,a] = factor(mylabel.col[,a], levels = levels(as.factor(mylabel.col[,a])), labels = mycol_plus[1:4] )
mylabel.nocol$clusters<-as.numeric(as.character(mylabel.nocol$clusters))
mylabel.nocol$clusters<-as.character(paste("cluster",mylabel.nocol$clusters, sep=""))

# plot heatmap
par(mfrow=c(1,1))
mat3<- t(data.matrix(heat2[,2:17]))
colnames(mat3)<-heat2$Row.names
mat3= mat3[order(rownames(mat3)),]
temp_name = getBM( attributes = c("ensembl_transcript_id", "entrezgene", "external_gene_name"), filters = "entrezgene", values = gsub("_at","",rownames(mat3)), mart = ensembl)[,c(2:3)]
temp_name = temp_name[!duplicated(temp_name[,1]),]
rownames(mat3) = c("AL441992.1",temp_name [,2] )
mat3  <- mat3 - rowMeans(mat3)
pheatmap(mat3, annotation_col = mylabel.nocol, annotation_colors = list(clusters = c(cluster1= mycol_plus[1], cluster2 = mycol_plus[2], cluster3 = mycol_plus[3], cluster4 = mycol_plus[4]), Histology = c(AITL = "black", PTCL.nos = "orange"), IDH2 = c(MUT=mycol[1], "NA"=mycol[2],WT=mycol[3]), RHOA = c(MUT=mycol[1],"NA"=mycol[2],WT=mycol[3]), TET2 = c(MUT=mycol[1],"NA"=mycol[2],WT=mycol[3]), DNMT3A = c(MUT=mycol[1],"NA"=mycol[2],WT=mycol[3]) ) ,  border_color= NA, color = colorRampPalette(rev(brewer.pal(n = 5 , name = "RdYlBu")))(100), scale = "row", cluster_cols = FALSE, show_colnames= F)

Analyze sample stratification based on the extracted differentially expressed genes betwee AILT and PTCL-nos and the ALCL ALK-negative 3-gene model.

select_hist<- pts.info.data[pts.info.data$final.molec == "AITL" | pts.info.data$final.molec == "PTCL.nos" | pts.info.data$final.molec == "ALCL.neg",]
# Add three classifier genes for ALCL ALK-neg [Agnelli et al, Blood, 2012] 
# Check on array
anaplastic_gene<- c("TNFRSF8","BATF3","TMOD1")
geneannotation2 <- getBM( attributes = c("entrezgene", "external_gene_name"), filters = "external_gene_name", values = anaplastic_gene, mart = ensembl )

anaplastic_gene_ARRAY<- paste0(geneannotation2$entrezgene, "_at")

# Append 16-gene model to 3-gene model
list_genes_all<- c(list_genes, anaplastic_gene_ARRAY)

# Redo consensus cluster analysis
gep<- geneExpr[,select_hist$sample.nameNEW]
mat<- gep[list_genes_all,]
title=tempdir()
d<- data.matrix(mat)
d = sweep(d,1, apply(d,1,median,na.rm=T))
results = ConsensusClusterPlus(d,maxK=8,
                               pFeature=1,
                               title=title, 
                               clusterAlg="hc",
                               innerLinkage="ward.D2", 
                               finalLinkage="ward.D2", 
                               distance="euclidean", 
                               seed=123456789)
## end fraction
## clustered

## clustered

## clustered

## clustered

## clustered

## clustered

## clustered

kk_5<- as.data.frame((results[[5]]$consensusClass)) ##### 4 significant cluster
kk_5$geo.id<- rownames(kk_5)
colnames(kk_5)[1]<- "cluster"
table(kk_5$cluster)
## 
##   1   2   3   4   5 
##  87 103  32  63  55

Plot heatmap AITL, PTCL-NOS, ALCL-neg and the 19-gene model

heat<- merge(t(mat), kk_5, by.x = 0, by.y="geo.id")
heat2<- merge(heat, pts.info.data, by.x = 1, by.y="sample.nameNEW")
heat2<- heat2[order(heat2$cluster),]
mycol= c("red","white","yellow")
mylabel = heat2[,c("Row.names","cluster","final.molec","TET2","RHOA","IDH2","DNMT3A")]
colnames(mylabel)<- c("sample.names","clusters","Histology","TET2","RHOA","IDH2","DNMT3A")
rownames(mylabel) = mylabel$sample.names
mylabel$sample.names  = NULL
mylabel.nocol = mylabel
mylabel.col = mylabel
mylabel.col[is.na(mylabel.col)]<-0
#head(mylabel.col)
mylabel.col$Histology[mylabel.col$Histology == "AITL"] = "black"; mylabel.col$Histology[mylabel.col$Histology == "PTCL.nos"] = "orange"; mylabel.col$Histology[mylabel.col$Histology == "ALCL.neg"] = "yellow"
for (a in c(3:6)) mylabel.col[,a] = factor(mylabel.col[,a], levels = levels(as.factor(mylabel.col[,a])), labels = mycol )
mycol_plus<- c(brewer.pal(11,"Paired"),brewer.pal(6,"Set2"))
for (a in 1) mylabel.col[,a] = factor(mylabel.col[,a], levels = levels(as.factor(mylabel.col[,a])), labels = mycol_plus[1:5] )
mylabel.nocol$clusters<-as.numeric(as.character(mylabel.nocol$clusters))
mylabel.nocol$clusters<-as.character(paste("cluster",mylabel.nocol$clusters, sep=""))

par(mfrow=c(1,1))
par(mar=c(5,5,5,5), xpd=F)
mat3<- t(data.matrix(heat2[,2:20]))
colnames(mat3)<-heat2$Row.names
mat3= mat3[order(rownames(mat3)),]
temp_name = getBM( attributes = c("ensembl_transcript_id", "entrezgene", "external_gene_name"), filters = "entrezgene", values = gsub("_at","",rownames(mat3)), mart = ensembl)[,c(2:3)]
temp_name = temp_name[!duplicated(temp_name[,1]),]
rownames(mat3) = c("AL441992.1",temp_name [,2] )
mat3  <- mat3 - rowMeans(mat3)
par(mfrow=c(1,1))
pheatmap(mat3, annotation_col = mylabel.nocol, annotation_colors = list(clusters = c(cluster1= mycol_plus[1], cluster2 = mycol_plus[2], cluster3 = mycol_plus[3], cluster4 = mycol_plus[4], cluster5 = mycol_plus[5]), Histology = c(AITL = "black", PTCL.nos = "orange", ALCL.neg= "yellow"), IDH2 = c(MUT=mycol[1], "NA"=mycol[2],WT=mycol[3]), RHOA = c(MUT=mycol[1],"NA"=mycol[2],WT=mycol[3]), TET2 = c(MUT=mycol[1],"NA"=mycol[2],WT=mycol[3]), DNMT3A = c(MUT=mycol[1],"NA"=mycol[2],WT=mycol[3]) ) ,  border_color= NA, color = colorRampPalette(rev(brewer.pal(n = 5 , name = "RdYlBu")))(100), scale = "row", cluster_cols = FALSE, show_colnames= F, row_annotation =3, cellheight = 20)

LOOCV on AILT, ALCL neg, PTCLnos based on 19-gene model

y = t(mat3) 
cl.orig = c()
for (u in 1:nrow(y)) cl.orig [u] = unlist(strsplit(rownames(y)[u],"\\."))[1]

perm.mother = rownames(y)
perm.son = combn (perm.mother, length(perm.mother)-1) 

output <- cbind(perm.mother, NA)

for (i in 1:length(perm.mother)) {
  train <- y [ perm.son[,i], ] 
  test <- y [ ! ( rownames(y) %in% perm.son[,i]) , ]
  cl <- cl.orig [which(rownames(y)%in%perm.son[,i])]
  z <- lda(train, cl)
  p <- predict(z,test)$class
  output  [ setdiff(1:340, which( rownames(y) %in% perm.son[,i]) ) , 2  ] = as.character(p)
#  output  [ output[,1] == rownames(test) , 3  ] = z$scaling [1,1]
#  output  [ output[,1] == rownames(test) , 4  ] = z$scaling [2,1]
#  output  [ output[,1] == rownames(test) , 5  ] = z$scaling [3,1]
}

colnames(output) = c("true","LOOCV.predicted")
output = as.data.frame(output)
output$true.class = cl.orig

table(output$true.class, output$LOOCV.predicted  )
##       
##        AITL ALCL PTCL
##   AITL  109    1   17
##   ALCL    5   44   20
##   PTCL   13   11  120

Cibersort algorithm to characterize the tumour and microenviroment composition of each cluster

##### cibersort and origical molecular histologies

cibersort.percentages<- read.delim("cibersort.percentages.txt", sep="\t", stringsAsFactors = F, header = T)
ciber_all<-as.data.frame.matrix(t(cibersort.percentages))
ciber_all$sample.nameNEW <- rownames(ciber_all)
colnames(kk_5)[2]<-"sample.nameNEW"
require(plyr)
## Loading required package: plyr
## 
## Attaching package: 'plyr'
## The following objects are masked from 'package:Hmisc':
## 
##     is.discrete, summarize
## The following object is masked from 'package:IRanges':
## 
##     desc
## The following object is masked from 'package:S4Vectors':
## 
##     rename
final <-join(ciber_all, kk_5, by = "sample.nameNEW",  type="left")
final2<-merge(pts.info.data[,c(1,6,14:17)], final, by="sample.nameNEW")
final3<- subset(final2, final.molec %in% c("AITL","ALCL.neg","ALCL.pos","ATLL","NKT","PTCL.nos"))
final3<- final3[order(final3$final.molec),]
library(RColorBrewer)
n <- 22
qual_col_pals = brewer.pal.info[brewer.pal.info$category == 'qual',]
col_vector = unlist(mapply(brewer.pal, qual_col_pals$maxcolors, rownames(qual_col_pals)))


par(mar=c(2,5,7,10), xpd=TRUE)
x<- barplot(t(final3[7:28]), names.arg = rep("", length(final3$final.molec)), cex.names = 0.7, col=col_vector, border=NA,
            space=rep(0, nrow(final3)))
legend("topright",legend=colnames(final3)[7:28], col=col_vector, pch=c(15), inset=c(-0.11,0), pt.cex= 1, 
cex = 1, bty = "n",  x.intersp = 0.7)


names_hist<- unique(final3$final.molec)
col_hist<- c("orange","yellow","dodgerblue2","brown2","darkorchid1","black")
num<- as.numeric(table(final3$final.molec))
for(i in (1:length(num)))
{
  segments(x[sum(num[1:i])+1-num[i]], 1.05,x[sum(num[1:i])],1.05,lwd=4, col=col_hist[i])
  text(x[(sum(num[1:i])-num[i] +1+ sum(num[1:i]))/2], 1.1, names_hist[i], cex=1.2, srt=0) 
 
}

####################################################################################################
####
#### devided for histoogy and clusters 
####
####################################################################################################
for(i in (1:nrow(final3)))
{
final3$cluster[i][is.na(final3$cluster[i])]<- final3$final.molec[i]
}

final3$cluster <- factor(final3$cluster, levels = c( "1","2","4","NKT","3","5","ALCL.pos", "ATLL"))

final3<- final3[order(final3$cluster),]

par(mar=c(2,5,7,10), xpd=TRUE)
x<- barplot(t(final3[7:28]), names.arg = rep("", length(final3$final.molec)), cex.names = 0.7, col=col_vector, border=NA,
            space=rep(0, nrow(final3)))
legend("topright",legend=colnames(final3)[7:28], col=col_vector, pch=c(15), inset=c(-0.11,0), pt.cex= 1, 
cex = 1, bty = "n",  x.intersp = 0.7)

mycol_plus<- c(brewer.pal(11,"Paired"),brewer.pal(6,"Dark2"))
names_hist<- c("C-1","C-2", "C-4","NKT","C-3","C-5","ALCL.pos","ATLL")
col_hist<- c(mycol_plus[1],mycol_plus[2],mycol_plus[4],"darkorchid1",mycol_plus[3],mycol_plus[5],"dodgerblue2","brown2","")
num<- as.numeric(table(final3$cluster))
  par(new=TRUE)
for(i in (1:(length(num))))
{
  segments(x[sum(num[1:i])+1-num[i]], 1.05,x[sum(num[1:i])],1.05,lwd=4, col=col_hist[i])
  text(x[(sum(num[1:i])-num[i] +1+ sum(num[1:i]))/2], 1.1, names_hist[i], cex=1.2, srt=0) 
 
}

annotation_col<- final3[,c("TET2","RHOA","IDH2","DNMT3A","final.molec","cluster")]
annotation_col[is.na(annotation_col)]<-0
rownames(annotation_col)<- final3$sample.nameNEW
colnames(annotation_col)[5]<-c("Hist")
A <- function(x) (as.factor(as.character(x))) ##### lapply function for all columns to generate the relative contribution
annotation_col[,1:ncol(annotation_col)] = apply(annotation_col[,1:ncol(annotation_col)], 2, function(x) as.factor(as.character(x)))
annotation_col<- as.data.frame(annotation_col)
mycol_plus<- c(brewer.pal(11,"Paired"),brewer.pal(6,"Dark2"))
color.annot<- as.character(color.annot)
ann_colors = list(Hist=c( "AITL"=color.annot[1],"ALCL.neg"=color.annot[2],"PTCL.nos"=color.annot[6], "NKT"   =color.annot[5]  ,
                          "ALCL.pos" =color.annot[3],"ATLL"=color.annot[4]),
                  cluster=c("1" = mycol_plus[1],"2" = mycol_plus[2],"4" = mycol_plus[4],"NKT" = color.annot[5],
                            "3" = mycol_plus[3],"5" = mycol_plus[5], "ALCL.pos" = color.annot[3],"ATLL" =color.annot[4] ),
                  DNMT3A = c("MUT"="red","WT"= "yellow2","0"="grey96"),
                  IDH2 = c("MUT"="red","WT"= "yellow2","0"="grey96"),
                  RHOA = c("MUT"="red","WT"= "yellow2","0"="grey96"),
                  TET2 = c("MUT"="red","WT"= "yellow2","0"="grey96")
                  )
edata<- as.matrix(((final3[,c(7:28)])))
rownames(edata)<-final3$sample.nameNEW 
library(pheatmap)
pheatmap(as.matrix( t(edata)), annotation_col=annotation_col, annotation_colors = ann_colors, 
         breaks = c(seq(0, 0.1, by= 0.001), seq(0.101, 0.2, by= 0.005),seq(0.21, 1, by= 0.01 )) , color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdYlBu")))(204), cellheight = 20, 
         cluster_cols =  F, border_color="NA", show_colnames = F)

##### ##### ##### ##### ##### ##### ##### ##### ##### ##### ##### ##### ##### ##### ##### ##### ##### ##### ##### 
##### 
##### 
##### focus the analysis on AITL, PTCL-NOS and ALCL-neg
##### 
##### 
##### ##### ##### ##### ##### ##### ##### ##### ##### ##### ##### ##### ##### ##### ##### ##### ##### ##### ##### 

load("cibersort.all.Rdata")
ciber<-as.data.frame.matrix(t(cibersort.percentages))
ciber_select<-ciber[rownames(kk_5),]
ciber_select$sample.nameNEW<- rownames(ciber_select)
final<- merge(ciber_select, kk_5, by="sample.nameNEW")
final2<- merge(final, pts.info.data, by="sample.nameNEW")
final2<- final2[order(final2$cluster),]

annotation_col<- final2[,c("TET2","RHOA","IDH2","DNMT3A","final.molec","cluster")]
annotation_col[is.na(annotation_col)]<-0
rownames(annotation_col)<- final2$sample.nameNEW
colnames(annotation_col)[5]<-c("Hist")

A <- function(x) (as.factor(as.character(x))) ##### lapply function for all columns to generate the relative contribution
annotation_col[,1:ncol(annotation_col)] = apply(annotation_col[,1:ncol(annotation_col)], 2, function(x) as.factor(as.character(x)))
annotation_col<- as.data.frame(annotation_col)
mycol_plus<- c(brewer.pal(11,"Paired"),brewer.pal(6,"Dark2"))

mycol_plus<- c(brewer.pal(11,"Paired"),brewer.pal(6,"Set2"))
color.annot<- as.character(color.annot)
ann_colors = list(Hist=c( "AITL"=color.annot[1],"ALCL.neg"=color.annot[2],"PTCL.nos"=color.annot[6]),
                  cluster=c("1" = mycol_plus[1],"2" = mycol_plus[2],"3" = mycol_plus[3],"4" = mycol_plus[4],"5" = mycol_plus[5]),
                  DNMT3A = c("MUT"="red","WT"= "yellow2","0"="grey96"),
                  IDH2 = c("MUT"="red","WT"= "yellow2","0"="grey96"),
                  RHOA = c("MUT"="red","WT"= "yellow2","0"="grey96"),
                  TET2 = c("MUT"="red","WT"= "yellow2","0"="grey96")
                  )
edata<- as.matrix(((final2[,c(2:23)])))
rownames(edata)<-final2$sample.nameNEW 
library(pheatmap)
pheatmap(as.matrix( t(edata)), annotation_col=annotation_col, annotation_colors = ann_colors, 
         breaks = c(seq(0, 0.1, by= 0.001), seq(0.101, 0.2, by= 0.005),seq(0.21, 1, by= 0.01 )) , color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdYlBu")))(204), cellheight = 15,
         cluster_cols =  F, border_color="NA")

Boxplot comparing the contribution of each cibersort signature between all extracted clusters

par(mfrow=c(1,2))
par(mar=c(3,3,3,3), xpd=F)
for(i in (2:23))
{
  k<- as.numeric(final2[,i])
  table_wilk<- pairwise.wilcox.test(k,final2$cluster,p.adjust.methods = "bonferroni" )$p.value
  df_wilk <- data.frame(expand.grid(dimnames(table_wilk)),array(table_wilk))
  df_wilk2<-na.omit(df_wilk)
  df_wilk2_sig<- df_wilk2[df_wilk2$array.table_wilk.<0.05,]
  df_wilk2_sig$Var1<-as.numeric(as.character(df_wilk2_sig$Var1))
  df_wilk2_sig$Var2<-as.numeric(as.character(df_wilk2_sig$Var2))
  if(nrow(df_wilk2_sig)>0)
  {
  boxplot(k~final2$cluster, ylim=c(0,(max(k)+0.2)), main=colnames(final2)[i], cex.main=2, col=mycol_plus, las=2)
  for(j in (1:nrow(df_wilk2_sig)))
  {
    segments(df_wilk2_sig$Var1[j], max(k)-0.01+j/30, df_wilk2_sig$Var2[j],max(k)-0.01+j/30)
    p<-df_wilk2_sig$array.table_wilk.[j] 
    if(p<0.00001){p2 = "<0.00001"}else{
    p2<-as.numeric(formatC(p,digits=6,format="f"))}
    pval <- paste("p =",p2,sep=" ")
    text((df_wilk2_sig$Var1[j]+ df_wilk2_sig$Var2[j])/1.9,  max(k) +j/30, pval, cex=0.8)
  }
    }
   }

final<- read.delim("aitl_nos_alcl_clsutering.txt",sep="\t",header = T,stringsAsFactors = F)
final2<- final[,c("Row.names","hist","cluster")]

mat<- read.delim("ensembl_annotated_matrix.txt", sep="\t", stringsAsFactors = F) #### ensembl annotated gep marrix
design <- model.matrix(~ 0+factor(final2$cluster)) ##### create matrix
colnames(design)<-paste0("Cluster_",c(1:5))

contrast.matrix <- makeContrasts(Cluster_2-Cluster_1, Cluster_3-Cluster_1,Cluster_4-Cluster_1, Cluster_5-Cluster_1,
                                 Cluster_3-Cluster_2, Cluster_4-Cluster_2, Cluster_5-Cluster_2, 
                                 Cluster_4-Cluster_3, Cluster_5-Cluster_3,
                                 Cluster_4-Cluster_5,
                                 Cluster_2-(Cluster_1 + Cluster_3 + Cluster_4 + Cluster_5)/4,
                                 Cluster_3-(Cluster_1 + Cluster_2 + Cluster_4 + Cluster_5)/4,
                                 Cluster_4-(Cluster_1 + Cluster_2 + Cluster_3 + Cluster_5)/4,
                                 Cluster_1-(Cluster_2 + Cluster_3 + Cluster_4 + Cluster_5)/4,
                                 Cluster_5-(Cluster_2 + Cluster_3 + Cluster_4 + Cluster_1)/4,
                                 levels=design)

fit1 <- lmFit(mat, design)
fit2 <- contrasts.fit(fit1, contrast.matrix)
fit <- eBayes(fit2)

Supervised analysis between clusters

geneExpr = adj.data
geneExpr2<- geneExpr[,colnames(geneExpr) %in% final2$Row.names ]
geneExpr2<- geneExpr2[,final2$Row.names]
ensembl = useMart( "ensembl", dataset = "hsapiens_gene_ensembl" )
hgnc_swissprot <- getBM(attributes=c('entrezgene','hgnc_symbol','hgnc_id'),filters = 'entrezgene', 
                        values = gsub("_at","",rownames(geneExpr2)),mart = ensembl)
geneExpr3<- as.data.frame.matrix(geneExpr2[which(rownames(geneExpr2) %in% paste0(hgnc_swissprot$entrezgene,"_at")),])

levels_design<- c("Cluster_2-Cluster_1","Cluster_3-Cluster_1","Cluster_4-Cluster_1","Cluster_5-Cluster_1",
                 "Cluster_3-Cluster_2","Cluster_4-Cluster_2","Cluster_5-Cluster_2","Cluster_4-Cluster_3",
                 "Cluster_5-Cluster_3","Cluster_4-Cluster_5",
                 "Cluster_2-(Cluster_1 + Cluster_3 + Cluster_4 + Cluster_5)/4",
                 "Cluster_3-(Cluster_1 + Cluster_2 + Cluster_4 + Cluster_5)/4",
                 "Cluster_4-(Cluster_1 + Cluster_2 + Cluster_3 + Cluster_5)/4",
                 "Cluster_1-(Cluster_2 + Cluster_3 + Cluster_4 + Cluster_5)/4",
                 "Cluster_5-(Cluster_2 + Cluster_3 + Cluster_4 + Cluster_1)/4")
df_diff_all=NULL
for(i in (1:length(levels_design)))
{
tt <- topTable(fit, coef=i, number=Inf, genelist=rownames(geneExpr3))
tt$ID<- rownames(tt)
colnames(tt)[1]<-"GENE_SYMBOL"
head(tt, 10)
fg <- tt$GENE_SYMBOL[tt$adj.P.Val < 0.001 & abs( tt$logFC ) > 2]
length(fg)
df_diff<- cbind(fg, rep(levels_design[i], length(fg)))
df_diff_all<-rbind(df_diff_all, df_diff)
#plot(tt$logFC, -log10(tt$adj.P.Val))
}
df_diff_all<- as.data.frame.matrix(df_diff_all)

annotation_col<- final2
colnames(annotation_col)<-c("sampleID","Hist","cluster")
A <- function(x) (as.factor(as.character(x))) ##### lapply function for all columns to generate the relative contribution
annotation_col[,1:ncol(annotation_col)] = apply(annotation_col[,1:ncol(annotation_col)], 2, function(x) as.factor(as.character(x)))
annotation_col<- as.data.frame(annotation_col[,-1])
mycol_plus<- c(brewer.pal(11,"Paired"),brewer.pal(6,"Dark2"))
ann_colors = list(Hist=c( "AITL"="black","ALCL"="yellow","PTCL"="orange"),
                  cluster=c("1" = mycol_plus[1],"2" = mycol_plus[2],"3" = mycol_plus[3],"4" = mycol_plus[4],"5" =mycol_plus[5])
                  )

edata3<- mat[rownames(mat) %in% unique(df_diff_all$fg),]
pheatmap(as.matrix( as.matrix(edata3)), annotation_col=annotation_col, annotation_colors = ann_colors, border_color="NA", 
         scale = "row", cluster_cols = FALSE, show_colnames= F, show_rownames = FALSE)

############ table of genes

df_diff_all_tab=NULL
for(i in (1:length(levels_design)))
{
  tt <- topTable(fit, coef=i, number=Inf, genelist=rownames(geneExpr3))
  tt$ID<- rownames(tt)
  colnames(tt)[1]<-"GENE_SYMBOL"
  head(tt,10)
  fg <- tt[tt$adj.P.Val < 0.001 & abs( tt$logFC ) > 2,]
  if(nrow(fg)>0){
    fg$design<- levels_design[i]

  df_diff_all_tab<-rbind.data.frame(df_diff_all_tab, fg)
  #plot(tt$logFC"," -log10(tt$adj.P.Val))
  }
  }

nrow(df_diff_all_tab) #### number of genes differentially expressed between C-1, C-2, C-3, C-4, C-5
## [1] 668
##### list gene from Iqbal et al. blood 2014
iqbal<- unique(c("EFNB2","ROBO1","S1PR3","ANK2","LPAR1","SNAP91","SOX8","LPAR1","RAMP3","S1PR3","ROBO1","EFNB2","TUBB2B","SOX8","SOX8","ARHGEF10","DMRT1","SLC19A21","STK3","PERP","TNFRSF8","TMOD1","BATF3","CDC14B","PERP","WDFEY3","TMOD1","ATP6V0D1","AXL","CD59","CHI3L1","CLTC","COL6A1","CREG1","CTSB","CTSC","NR1","H3","PDXK","PITPNA","PLSCR1","PRDX3","CTSS","CYBB","FABP3","FPR1","FTL","GUCA2A","HCK","IFI30","IL13RA1","JAK2","LILRB1","PRKG1PSAP","SLC7A7","SOD2","TCN2","THY1","TYR","UBE2L6","WARS","AXL","FTL","SIRPA","STAT1","CSF2","IFNG","SEPT6","GATA3","CD28","STAT1","AXL","CD28","CD40","CD59","CSF2","FTL","IFNG","LILRB1","SIRPA","TBX21","MSH6","EGR1","CAT","EGR1","CAT"))

intersect(iqbal, unique(df_diff_all_tab$GENE_SYMBOL)) 
##  [1] "ROBO1"    "LPAR1"    "SOX8"     "TUBB2B"   "TNFRSF8"  "TMOD1"   
##  [7] "BATF3"    "ATP6V0D1" "CHI3L1"   "CREG1"    "CTSB"     "CTSC"    
## [13] "FTL"      "HCK"

Pairwise for pathway using tmod (https://cran.r-project.org/web/packages/tmod/vignettes/tmod.pdf)

fit1 <- lmFit(mat, design)
fit2 <- contrasts.fit(fit1, contrast.matrix)
fit <- eBayes(fit2)
res.l <- tmodLimmaTest(fit, rownames(mat))
length(res.l)
## [1] 15
names(res.l)
##  [1] "Cluster_2 - Cluster_1"                                        
##  [2] "Cluster_3 - Cluster_1"                                        
##  [3] "Cluster_4 - Cluster_1"                                        
##  [4] "Cluster_5 - Cluster_1"                                        
##  [5] "Cluster_3 - Cluster_2"                                        
##  [6] "Cluster_4 - Cluster_2"                                        
##  [7] "Cluster_5 - Cluster_2"                                        
##  [8] "Cluster_4 - Cluster_3"                                        
##  [9] "Cluster_5 - Cluster_3"                                        
## [10] "Cluster_4 - Cluster_5"                                        
## [11] "Cluster_2 - (Cluster_1 + Cluster_3 + Cluster_4 + Cluster_5)/4"
## [12] "Cluster_3 - (Cluster_1 + Cluster_2 + Cluster_4 + Cluster_5)/4"
## [13] "Cluster_4 - (Cluster_1 + Cluster_2 + Cluster_3 + Cluster_5)/4"
## [14] "Cluster_1 - (Cluster_2 + Cluster_3 + Cluster_4 + Cluster_5)/4"
## [15] "Cluster_5 - (Cluster_2 + Cluster_3 + Cluster_4 + Cluster_1)/4"
pie <- tmodLimmaDecideTests(fit, genes=rownames(mat))
par(mfrow=c(1,1))
res.l2<- lapply(res.l, function(x) {x[x$adj.P.Val<10e-8,]})
tmodPanelPlot(res.l2, pie=pie, text.cex=0.6) ##### zero = grey, blue down in the first factor and red up in the first

res.l2<- lapply(res.l, function(x) {x[x$adj.P.Val>10e-8 & x$adj.P.Val<10e-5,]})
tmodPanelPlot(res.l2, pie=pie, text.cex=0.6) ##### zero = grey, blue down in the first factor and red up in the first